'''
Created on Nov 8, 2011

@author: alebalbin
'''
from collections import defaultdict
def get_gene_candidates(file,field_name):
    '''
    Collect the gene names listed in one column of a tab-separated table.

    The first line of ``file`` is the header; ``field_name`` (for example
    ``gene_name``) selects the column by its exact header text.  For every
    following line, the selected cell has surrounding double quotes stripped
    and is then split on commas.

    Returns a list with one entry per data line, each entry being the list
    of gene names found in that line's cell.

    Raises ValueError if ``field_name`` is not in the header, IndexError if
    a data line has fewer columns than the header position requires, and
    StopIteration if the file is completely empty.
    '''
    # The context manager closes the file even when the header lookup or a
    # short row raises; next(f) works on both Python 2.6+ and Python 3.
    with open(file) as ifile:
        header = next(ifile).strip('\r\n').split('\t')
        ind = header.index(field_name)
        return [
            line.strip('\r\n').split('\t')[ind].strip('"').split(',')
            for line in ifile
        ]

def read_expression_table(file):
    '''
    Load a tab-separated expression table into a dictionary.

    The first line of ``file`` is treated as a header and skipped.  For
    every following line the first column, with surrounding double quotes
    stripped, becomes the key and the remaining columns (a list of strings)
    become the value.  The intended layout of the value is
    [location, length, rpkm_expression_value]; this is not validated here.
    If a gene name occurs more than once, the last line wins.

    Returns a ``defaultdict`` created without a default factory, so looking
    up a missing gene raises KeyError exactly like a plain dict.

    Raises StopIteration if the file is completely empty.
    '''
    genelist = defaultdict()
    # The context manager guarantees the handle is released on any error;
    # next(f) works on both Python 2.6+ and Python 3.
    with open(file) as ifile:
        next(ifile)  # skip the header line; its contents are not used
        for line in ifile:
            columns = line.strip('\r\n').split('\t')
            genelist[columns[0].strip('"')] = columns[1:]
    return genelist

def main(snv_file, expression_file,sample_name):
    '''
    Append an expression column to a tab-separated SNV table.

    ``snv_file`` must have a header line containing a ``gene_name`` column
    whose cells hold one or more comma-separated gene names (optionally
    wrapped in double quotes).  ``expression_file`` is loaded with
    read_expression_table(); the last value stored for each gene is used as
    its expression value.

    A copy of the SNV table is written next to the input with one extra
    column named ``sample_name + 'RPKM'``.  Each data line gets the values of
    its genes joined by commas, in the same order as the gene names; genes
    that are missing from the expression table (or have no values) are
    reported as ``NAN``.

    The output path is the input path with a trailing ``.txt`` replaced by
    ``_exp.tsv``; if the input does not end in ``.txt`` the suffix is simply
    appended, so the input file is never overwritten.

    Raises ValueError if the header has no ``gene_name`` column.
    '''
    field_name='gene_name'
    expression=read_expression_table(expression_file)

    # Only rewrite the extension itself.  A blanket str.replace would leave
    # the path unchanged for non-.txt inputs and open the input for writing.
    if snv_file.endswith('.txt'):
        out_path=snv_file[:-len('.txt')]+'_exp.tsv'
    else:
        out_path=snv_file+'_exp.tsv'

    with open(snv_file) as snv_table:
        hd=next(snv_table).strip('\r\n').split('\t')
        # Validate the header before creating the output file.
        ind=hd.index(field_name)
        hd.append(sample_name+'RPKM')

        with open(out_path,'w') as ofile:
            # Join with tabs directly so commas inside header cells survive.
            ofile.write('\t'.join(hd)+'\n')
            for l in snv_table:
                row=l.strip('\r\n')
                genes=row.split('\t')[ind].strip('"').split(',')
                gexp=[]
                for g in genes:
                    values=expression.get(g)
                    # Last stored column is the expression value.
                    gexp.append(values[-1] if values else 'NAN')
                ofile.write(row+'\t'+",".join(gexp)+'\n')

# Hard-coded inputs for a one-off run on the author's machine.  The
# commented-out path is the alternative (un-annotated) SNV table.
#snv_file='/Users/alebalbin/Documents/projects/fmpn/SeattleSeqAnnotation131.FMPN63_gatk_sam_candidates_filtered_SeqtAnnot_Nov6.220631505048.txt'
snv_file='/Users/alebalbin/Documents/projects/fmpn/SeattleSeqAnnotation131.FMPN63_gatk_sam_candidates_filtered_SeqtAnnot_Nov6.220631505048_exp.txt'
expression_file='/Users/alebalbin/Documents/projects/fmpn/FMPN54/FMPN54.expression-rpkm.txt'
# NOTE(review): the column prefix says fmpn51 while the expression file path
# says FMPN54 -- confirm which sample is intended.
sample_name='fmpn51'

# Only run the annotation when executed as a script, so that importing this
# module for its helper functions does not touch the filesystem.
if __name__ == '__main__':
    main(snv_file, expression_file,sample_name)

 
        